In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc 
import seaborn as sns

%matplotlib inline
In [2]:
# Comprovar les versions
print("Pandas ver. {}".format(pd.__version__))
print("Numpy ver. {}".format(np.__version__))
print("Scipy ver. {}".format(sc.__version__))
Pandas ver. 2.0.3
Numpy ver. 1.24.3
Scipy ver. 1.11.1

Càrrega del conjunt de dades¶

In [3]:
data = pd.read_csv('Invistico_Airline.csv')

# Mostrem els primers registres
data.head()
Out[3]:
satisfaction Gender Customer Type Age Type of Travel Class Flight Distance Seat comfort Departure/Arrival time convenient Food and drink ... Online support Ease of Online booking On-board service Leg room service Baggage handling Checkin service Cleanliness Online boarding Departure Delay in Minutes Arrival Delay in Minutes
0 satisfied Female Loyal Customer 65 Personal Travel Eco 265 0 0 0 ... 2 3 3 0 3 5 3 2 0 0.0
1 satisfied Male Loyal Customer 47 Personal Travel Business 2464 0 0 0 ... 2 3 4 4 4 2 3 2 310 305.0
2 satisfied Female Loyal Customer 15 Personal Travel Eco 2138 0 0 0 ... 2 2 3 3 4 4 4 2 0 0.0
3 satisfied Female Loyal Customer 60 Personal Travel Eco 623 0 0 0 ... 3 1 1 0 1 4 1 3 0 0.0
4 satisfied Female Loyal Customer 70 Personal Travel Eco 354 0 0 0 ... 4 2 2 0 2 4 2 5 0 0.0

5 rows × 23 columns

In [4]:
# Mostrem el nombre de registres del dataset
data.shape[0]
Out[4]:
129880
In [5]:
# Es mostra el nombre d'atributs del conjunt de dades
num_atributs = len(data.columns)
num_atributs
Out[5]:
23
In [6]:
# Es mostra el nom dels atributs del dataset
for col in data.columns:
    print(col)
satisfaction
Gender
Customer Type
Age
Type of Travel
Class
Flight Distance
Seat comfort
Departure/Arrival time convenient
Food and drink
Gate location
Inflight wifi service
Inflight entertainment
Online support
Ease of Online booking
On-board service
Leg room service
Baggage handling
Checkin service
Cleanliness
Online boarding
Departure Delay in Minutes
Arrival Delay in Minutes
In [7]:
# Obtenim el recompte de valors nuls a cada columna
missing_values = data.isnull().sum()
print(missing_values)
satisfaction                           0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Seat comfort                           0
Departure/Arrival time convenient      0
Food and drink                         0
Gate location                          0
Inflight wifi service                  0
Inflight entertainment                 0
Online support                         0
Ease of Online booking                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Cleanliness                            0
Online boarding                        0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
dtype: int64
In [8]:
#Eliminem els valors nuls de la columna Arrival Delay in Minutes
satAir = data.dropna(subset=["Arrival Delay in Minutes"])

# Tornem a veure el recompte de valors nuls a cada columna
missing_values = satAir.isnull().sum()
print(missing_values)
satisfaction                         0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Seat comfort                         0
Departure/Arrival time convenient    0
Food and drink                       0
Gate location                        0
Inflight wifi service                0
Inflight entertainment               0
Online support                       0
Ease of Online booking               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Cleanliness                          0
Online boarding                      0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
dtype: int64

Quin és el nivell de satisfacció global dels passatgers de la aerolínia?¶

In [10]:
freq = satAir["satisfaction"].value_counts()
print(freq)

plt.pie(freq, labels=freq.index, autopct='%1.1f%%', startangle=90, wedgeprops=dict(width=0.4))
plt.title("Satisfaction")
plt.show()
satisfaction
satisfied       70882
dissatisfied    58605
Name: count, dtype: int64

Hi ha diferències significatives entre el nombre de passatgers homes i dones?¶

In [11]:
from pywaffle import Waffle

gender_counts = satAir['Gender'].value_counts()
gender_percentage = (gender_counts / len(satAir)) * 100

# Creem el Waffle chart
fig = plt.figure( 
    FigureClass=Waffle,
    rows=5,  # files de gent
    figsize=(11, 6),
    values=gender_percentage,
    labels=[f"Female ({gender_percentage['Female']:.2f}%)", f"Male ({gender_percentage['Male']:.2f}%)"], 
    colors=["#FF82AB", "#1E90FE"],  
    icons=['female', 'male'],  
    legend={'loc': 'lower center',
            'bbox_to_anchor': (0.5, -0.5),
            'ncol': len(gender_counts),
            'framealpha': 0,
            'fontsize': 20
            },
    icon_size=30,
    icon_legend=True,
    title={'label': 'Distribució segons el gènere',
           'loc': 'center',
           'fontdict': {'fontsize': 20}
           }
)

plt.show()
In [12]:
def calculate_percentage_cross_tab_with_style(df, x):
    # Creem la tabulació creuada
    cross_tab = pd.crosstab(df[x], df['satisfaction'])
    
    # Convertim els counts en percentages i ajustem els decimals
    percentage_cross_tab = cross_tab.apply(lambda row: row / row.sum() * 100, axis=1)
    rounded_percentage_cross_tab = percentage_cross_tab.round(2)
    
    # Apliquem .style.background_gradient
    styled_percentage_cross_tab = rounded_percentage_cross_tab.style.background_gradient(cmap='Blues')
    
    return styled_percentage_cross_tab

calculate_percentage_cross_tab_with_style(satAir, 'Gender')
Out[12]:
satisfaction dissatisfied satisfied
Gender    
Female 34.860000 65.140000
Male 55.970000 44.030000

Existeix algun grup d'edat predominant?¶

In [15]:
import plotly.express as px

age_bins = [0, 20, 30, 40, 50, 60, 100]
age_labels = ["0-20", "21-30", "31-40", "41-50", "51-60", "61+"]

satAir.loc[:, 'Age Group'] = pd.cut(satAir['Age'], bins=age_bins, labels=age_labels, right=False)

age_group_counts = satAir['Age Group'].value_counts().reset_index()
age_group_counts.columns = ['Age Group', 'Count']
age_group_counts.style.background_gradient(cmap='Blues')
Out[15]:
  Age Group Count
0 41-50 29555
1 21-30 26098
2 31-40 25623
3 51-60 23902
4 61+ 12168
5 0-20 12141
In [16]:
fig = px.bar(
    age_group_counts,
    x='Age Group',
    y='Count',
    title='Grups edat',
    labels={'Count': 'Number of Customers'},
    color='Age Group',
    color_discrete_sequence=px.colors.sequential.Blues[::-1], 
)

fig.update_layout(
    xaxis_title='Grup edat',
    yaxis_title='Nombre de clients',
    font=dict(size=12),
    title_font=dict(size=16),
    showlegend=False,
    plot_bgcolor='#FFFFFF', 
    margin=dict(l=40, r=40, t=80, b=40),  
)

fig.show()
In [17]:
calculate_percentage_cross_tab_with_style(satAir, 'Age Group')
Out[17]:
satisfaction dissatisfied satisfied
Age Group    
0-20 56.680000 43.320000
21-30 55.490000 44.510000
31-40 49.950000 50.050000
41-50 34.270000 65.730000
51-60 33.300000 66.700000
61+ 52.230000 47.770000

Quin paper tenen el gènere i l'edat en la satisfacció dels passatgers?¶

In [18]:
satisfaction_by_gender_age = satAir.groupby(['Gender', 'Age Group'])['satisfaction'].value_counts(normalize=True).unstack()
satisfaction_by_gender_age.style.background_gradient(cmap='Blues')
Out[18]:
  satisfaction dissatisfied satisfied
Gender Age Group    
Female 0-20 0.323343 0.676657
21-30 0.485693 0.514307
31-40 0.419186 0.580814
41-50 0.269948 0.730052
51-60 0.241408 0.758592
61+ 0.321965 0.678035
Male 0-20 0.815895 0.184105
21-30 0.630902 0.369098
31-40 0.580095 0.419905
41-50 0.417214 0.582786
51-60 0.425130 0.574870
61+ 0.725497 0.274503

Existeixen diferències d'edat segons el gènere?¶

In [19]:
fig = px.histogram(satAir, x='Age', color='Gender', marginal='box', 
                   title='Distribució edat per gènere',
                   labels={'Age': 'Age'},
                   color_discrete_sequence=['#66B2FF', '#FF69B4']) 

fig.show()

Quants clients es classifiquen com a "Loyal" i "disloyal"?¶

In [20]:
freq = satAir["Customer Type"].value_counts()
print(freq)

plt.pie(freq, labels=freq.index, autopct='%1.1f%%', startangle=90, wedgeprops=dict(width=0.4))
plt.title("Customer Type")
plt.show()
Customer Type
Loyal Customer       105773
disloyal Customer     23714
Name: count, dtype: int64

Influeix el tipus de client alhora de valorar la satisfacció?¶

In [25]:
g = sns.catplot(x="satisfaction", col="Customer Type", col_wrap=2, data=satAir, kind="count")

Quina és l'edat mitjana segons el tipus de client (Loyal vs. Disloyal)?¶

In [26]:
average_age_by_customer_type = satAir.groupby('Customer Type')['Age'].mean().reset_index()
average_age_by_customer_type.style.background_gradient(cmap='Blues')
Out[26]:
  Customer Type Age
0 Loyal Customer 41.463625
1 disloyal Customer 30.352534
In [27]:
fig = px.box(
    satAir, x='Customer Type', y='Age', color='Customer Type',
    title='Distribució Edat per Customer Type',
    labels={'Age': 'Age', 'Customer Type': 'Customer Type'},
    color_discrete_sequence=['#98F5FF', '#08306B']
)

fig.show()

Com es distribueix el tipus de viatge i la classe de viatge entre els clients?¶

In [28]:
freq = satAir["Type of Travel"].value_counts()
print(freq)

plt.pie(freq, labels=freq.index, autopct='%1.1f%%', startangle=90, wedgeprops=dict(width=0.4))
plt.title("Type of Travel")
plt.show()
Type of Travel
Business travel    89445
Personal Travel    40042
Name: count, dtype: int64

Quina és la satisfacció segons el tipus de viatge?¶

In [29]:
calculate_percentage_cross_tab_with_style(satAir, 'Type of Travel')
Out[29]:
satisfaction dissatisfied satisfied
Type of Travel    
Business travel 41.630000 58.370000
Personal Travel 53.360000 46.640000
In [30]:
freq = satAir["Class"].value_counts()
print(freq)

plt.pie(freq, labels=freq.index, autopct='%1.1f%%', startangle=90, wedgeprops=dict(width=0.4))
plt.title("Class")
plt.show()
Class
Business    61990
Eco         58117
Eco Plus     9380
Name: count, dtype: int64

Quina és la satisfacció segons la classe de viatge?¶

In [31]:
calculate_percentage_cross_tab_with_style(satAir, 'Class')
Out[31]:
satisfaction dissatisfied satisfied
Class    
Business 29.060000 70.940000
Eco 60.600000 39.400000
Eco Plus 57.280000 42.720000

Quin és el rang d'edat dels clients del nostre conjunt de dades?¶

In [32]:
import plotly.express as px

age_range_fig = px.histogram(satAir, x="Age", title="Distribució de les edats")
age_range_fig.update_traces(marker=dict(color='skyblue'))

age_range_fig.update_layout(
    title=dict(text="Distribució de les edats", x=0.5, y=0.95, xanchor='center', yanchor='top'),
    xaxis=dict(title="Age"),
    yaxis=dict(title="Count"),
    showlegend=False,  
    bargap=0.1,        
    plot_bgcolor='white', 
    font=dict(family="Arial", size=12),  
)

age_range_fig.show()

Com varia la distància de vol i la satisfacció entre els clients?¶

In [33]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,6))
axes = axes.ravel() # Canviem la matriu 2D en una matriu 1d contínua

sns.histplot(x='Flight Distance', data=satAir, ax=axes[0])
sns.kdeplot(data=satAir, x ="Flight Distance", hue="satisfaction", ax=axes[1], multiple="stack")
Out[33]:
<Axes: xlabel='Flight Distance', ylabel='Density'>

Quina és la distància mitjana de vol per a cada classe?¶

In [34]:
fig = px.box(satAir, x='Class', y='Flight Distance', color='Class',
             title='Distribució de la distància de vol per classe',
             labels={'Flight Distance': 'Distance'}            
)

fig.show()

Com varia la satisfacció segons el tipus i la classe de viatge en funció de la distància de vol?¶

In [35]:
g = sns.catplot(x="Flight Distance", y="Type of Travel", hue="satisfaction", col="Class", data=satAir, kind="bar", height=4.5, aspect=.8)

Com afecta el propòsit del viatge (personal o de negoci) segons l'edat?¶

In [36]:
average_age_by_purpose = satAir.groupby('Type of Travel')['Age'].mean().reset_index()

fig = px.bar(average_age_by_purpose, x='Type of Travel', y='Age',
             title='Edat mitjana segons el propòsit del viatge',
             labels={'Age', 'Type of Travel'},
             color='Type of Travel', 
             color_discrete_sequence=['#66B2FF', '#90EE90'],  
             )

fig.show()

Existeix correlació entre els retards (sortida i arribada) i el nivell de satisfacció?¶

In [37]:
fig = px.scatter(satAir, x='Departure Delay in Minutes', y='Arrival Delay in Minutes', color='satisfaction',
                 title='Satisfacció de Departure Delay i Arrival Delay',
                 labels={'Departure Delay in Minutes': 'Departure Delay', 'Arrival Delay in Minutes': 'Arrival Delay'},
                 color_discrete_sequence=['#90EE90', '#66B2FF'])  
fig.show()

Afecta un retard en l'arribada o sortida segons el tipus i class de viatge en la satisfacció?¶

In [38]:
g = sns.catplot(x="Class", y="Departure Delay in Minutes", hue="satisfaction", col="Type of Travel", data=satAir, kind="bar")
g = sns.catplot(x="Class", y="Arrival Delay in Minutes", hue="satisfaction", col="Type of Travel", data=satAir, kind="bar")

Serveis de la companyia de vol¶

In [39]:
num_vars = ["Seat comfort", "Departure/Arrival time convenient",
            "Food and drink", "Gate location", "Inflight wifi service", "Inflight entertainment",
            "Online support", "Ease of Online booking", "On-board service", "Leg room service", "Baggage handling",
            "Checkin service", "Cleanliness", "Online boarding"]

fig, ax = plt.subplots(7, 2, figsize = (15, 50))

for i, num_var in enumerate(num_vars):
    ax[i//2][i%2].hist(satAir[num_var], bins=15)
    ax[i//2][i%2].set_title(num_var)

Quins serveis influeixen més alhora de la satisfacció dels clients?¶

In [40]:
def create_grouped_bar_chart(x, y, df, color1, color2):
    # Percentatge de satisfacció
    satisfaction_percentage = (
        df.groupby([x, y]).size() /
        df.groupby([x]).size()
    ).reset_index(name='Percentage').round(4)

    satisfaction_percentage['Percentage'] *= 100

    # Creem un bar chart agrupat
    fig = px.bar(
        satisfaction_percentage,
        x=x,
        y='Percentage',
        color=y,
        barmode='group',
        title=f'{x} vs {y}',
        labels={'Percentage': 'Percentage of Customers'},
        color_discrete_sequence=[color1, color2],  
    )

    fig.show()
In [41]:
Services = ["Seat comfort", "Departure/Arrival time convenient",
            "Food and drink", "Gate location", "Inflight wifi service", "Inflight entertainment",
            "Online support", "Ease of Online booking", "On-board service", "Leg room service", "Baggage handling",
            "Checkin service", "Cleanliness", "Online boarding"]
In [42]:
dissatisfied_color = '#90EE90'
satisfied_color = '#66B2FF'

for i in range(len(Services)):
    create_grouped_bar_chart(Services[i], 'satisfaction', satAir, satisfied_color, dissatisfied_color)

Quines variables influeixen alhora de una major satisfacció?¶

In [43]:
satisfaction = {"satisfaction":     {"dissatisfied": 0, "satisfied": 1}}
satAir = satAir.replace(satisfaction)
satAir['satisfaction'].value_counts()
Out[43]:
satisfaction
1    70882
0    58605
Name: count, dtype: int64
In [65]:
corr_all = satAir.drop(columns=["Gender", "Customer Type", "Type of Travel", "Class","Age Group"]).corr()
plt.figure(figsize=(25, 25))
cmap = sns.diverging_palette(150, 1, as_cmap=True)
sns.heatmap(corr_all, cmap=cmap, vmax=None, center=0, square=True, annot=True, linewidths=.5, cbar_kws={"shrink": .9})

plt.show()